The dataset is set to analyze the rate of unemployment
in India in specific locations and compare them to one another
OBJECTIVES;
1. Clean and thouroughly analyze the dataset
2. Analyze some important trends in the dataset.
3. Answer data driven questions and showcase answers using charts.
#import libraries and datasets
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
data=pd.read_csv(r"C:\Users\AISHAT\Desktop\PROJECTS\Cognorise\archive\Unemployment in India.csv")
data2=pd.read_csv(r"C:\Users\AISHAT\Desktop\PROJECTS\Cognorise\archive\Unemployment_Rate_upto_11_2020.csv")
data
| Region | Date | Frequency | Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | Area | |
|---|---|---|---|---|---|---|---|
| 0 | Andhra Pradesh | 31-05-2019 | Monthly | 3.65 | 11999139.0 | 43.24 | Rural |
| 1 | Andhra Pradesh | 30-06-2019 | Monthly | 3.05 | 11755881.0 | 42.05 | Rural |
| 2 | Andhra Pradesh | 31-07-2019 | Monthly | 3.75 | 12086707.0 | 43.50 | Rural |
| 3 | Andhra Pradesh | 31-08-2019 | Monthly | 3.32 | 12285693.0 | 43.97 | Rural |
| 4 | Andhra Pradesh | 30-09-2019 | Monthly | 5.17 | 12256762.0 | 44.68 | Rural |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 763 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 764 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 765 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 766 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 767 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
768 rows × 7 columns
#column names
data.columns
Index(['Region', ' Date', ' Frequency', ' Estimated Unemployment Rate (%)',
' Estimated Employed', ' Estimated Labour Participation Rate (%)',
'Area'],
dtype='object')
#remove spaces in column names
data.columns=data.columns.str.strip()
data.columns
Index(['Region', 'Date', 'Frequency', 'Estimated Unemployment Rate (%)',
'Estimated Employed', 'Estimated Labour Participation Rate (%)',
'Area'],
dtype='object')
data
| Region | Date | Frequency | Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | Area | |
|---|---|---|---|---|---|---|---|
| 0 | Andhra Pradesh | 31-05-2019 | Monthly | 3.65 | 11999139.0 | 43.24 | Rural |
| 1 | Andhra Pradesh | 30-06-2019 | Monthly | 3.05 | 11755881.0 | 42.05 | Rural |
| 2 | Andhra Pradesh | 31-07-2019 | Monthly | 3.75 | 12086707.0 | 43.50 | Rural |
| 3 | Andhra Pradesh | 31-08-2019 | Monthly | 3.32 | 12285693.0 | 43.97 | Rural |
| 4 | Andhra Pradesh | 30-09-2019 | Monthly | 5.17 | 12256762.0 | 44.68 | Rural |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 763 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 764 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 765 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 766 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 767 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
768 rows × 7 columns
#data2
data2.head(10)
| Region | Date | Frequency | Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | Region.1 | longitude | latitude | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Andhra Pradesh | 31-01-2020 | M | 5.48 | 16635535 | 41.02 | South | 15.9129 | 79.74 |
| 1 | Andhra Pradesh | 29-02-2020 | M | 5.83 | 16545652 | 40.90 | South | 15.9129 | 79.74 |
| 2 | Andhra Pradesh | 31-03-2020 | M | 5.79 | 15881197 | 39.18 | South | 15.9129 | 79.74 |
| 3 | Andhra Pradesh | 30-04-2020 | M | 20.51 | 11336911 | 33.10 | South | 15.9129 | 79.74 |
| 4 | Andhra Pradesh | 31-05-2020 | M | 17.43 | 12988845 | 36.46 | South | 15.9129 | 79.74 |
| 5 | Andhra Pradesh | 30-06-2020 | M | 3.31 | 19805400 | 47.41 | South | 15.9129 | 79.74 |
| 6 | Andhra Pradesh | 31-07-2020 | M | 8.34 | 15431615 | 38.91 | South | 15.9129 | 79.74 |
| 7 | Andhra Pradesh | 31-08-2020 | M | 6.96 | 15251776 | 37.83 | South | 15.9129 | 79.74 |
| 8 | Andhra Pradesh | 30-09-2020 | M | 6.40 | 15220312 | 37.47 | South | 15.9129 | 79.74 |
| 9 | Andhra Pradesh | 31-10-2020 | M | 6.59 | 15157557 | 37.34 | South | 15.9129 | 79.74 |
data2.columns
Index(['Region', ' Date', ' Frequency', ' Estimated Unemployment Rate (%)',
' Estimated Employed', ' Estimated Labour Participation Rate (%)',
'Region.1', 'longitude', 'latitude'],
dtype='object')
#removing spaces in column names of table2
data2.columns=data2.columns.str.strip()
data2.columns
Index(['Region', 'Date', 'Frequency', 'Estimated Unemployment Rate (%)',
'Estimated Employed', 'Estimated Labour Participation Rate (%)',
'Region.1', 'longitude', 'latitude'],
dtype='object')
#checking for the region names
data2["Region"].unique()
array(['Andhra Pradesh', 'Assam', 'Bihar', 'Chhattisgarh', 'Delhi', 'Goa',
'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jammu & Kashmir',
'Jharkhand', 'Karnataka', 'Kerala', 'Madhya Pradesh',
'Maharashtra', 'Meghalaya', 'Odisha', 'Puducherry', 'Punjab',
'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana', 'Tripura',
'Uttar Pradesh', 'Uttarakhand', 'West Bengal'], dtype=object)
data2["Frequency"].unique()
array([' M'], dtype=object)
data2["Frequency"]=data2["Frequency"].str.strip()
data2["Frequency"]
0 M
1 M
2 M
3 M
4 M
..
262 M
263 M
264 M
265 M
266 M
Name: Frequency, Length: 267, dtype: object
data2
| Region | Date | Frequency | Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | Region.1 | longitude | latitude | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Andhra Pradesh | 31-01-2020 | M | 5.48 | 16635535 | 41.02 | South | 15.9129 | 79.740 |
| 1 | Andhra Pradesh | 29-02-2020 | M | 5.83 | 16545652 | 40.90 | South | 15.9129 | 79.740 |
| 2 | Andhra Pradesh | 31-03-2020 | M | 5.79 | 15881197 | 39.18 | South | 15.9129 | 79.740 |
| 3 | Andhra Pradesh | 30-04-2020 | M | 20.51 | 11336911 | 33.10 | South | 15.9129 | 79.740 |
| 4 | Andhra Pradesh | 31-05-2020 | M | 17.43 | 12988845 | 36.46 | South | 15.9129 | 79.740 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 262 | West Bengal | 30-06-2020 | M | 7.29 | 30726310 | 40.39 | East | 22.9868 | 87.855 |
| 263 | West Bengal | 31-07-2020 | M | 6.83 | 35372506 | 46.17 | East | 22.9868 | 87.855 |
| 264 | West Bengal | 31-08-2020 | M | 14.87 | 33298644 | 47.48 | East | 22.9868 | 87.855 |
| 265 | West Bengal | 30-09-2020 | M | 9.35 | 35707239 | 47.73 | East | 22.9868 | 87.855 |
| 266 | West Bengal | 31-10-2020 | M | 9.98 | 33962549 | 45.63 | East | 22.9868 | 87.855 |
267 rows × 9 columns
data2["Date"].unique()
array([' 31-01-2020', ' 29-02-2020', ' 31-03-2020', ' 30-04-2020',
' 31-05-2020', ' 30-06-2020', ' 31-07-2020', ' 31-08-2020',
' 30-09-2020', ' 31-10-2020'], dtype=object)
data["Date"].unique()
array([' 31-05-2019', ' 30-06-2019', ' 31-07-2019', ' 31-08-2019',
' 30-09-2019', ' 31-10-2019', ' 30-11-2019', ' 31-12-2019',
' 31-01-2020', ' 29-02-2020', ' 31-03-2020', ' 30-04-2020',
' 31-05-2020', ' 30-06-2020', nan], dtype=object)
#number of regions present
data["Region"].value_counts()
Region Andhra Pradesh 28 Kerala 28 West Bengal 28 Uttar Pradesh 28 Tripura 28 Telangana 28 Tamil Nadu 28 Rajasthan 28 Punjab 28 Odisha 28 Madhya Pradesh 28 Maharashtra 28 Karnataka 28 Jharkhand 28 Himachal Pradesh 28 Haryana 28 Gujarat 28 Delhi 28 Chhattisgarh 28 Bihar 28 Meghalaya 27 Uttarakhand 27 Assam 26 Puducherry 26 Goa 24 Jammu & Kashmir 21 Sikkim 17 Chandigarh 12 Name: count, dtype: int64
#checking the most common number of Regions present(the mode)
data["Region"].value_counts().mode()
0 28 Name: count, dtype: int64
#create a new dataset by combining necessary columns from both datasets
region=data["Region"].unique().tolist()
date=[]
frequency=[]
estimated_unemployment_rate=[]
estimated_employed=[]
estimated_labour_participation_rate=[]
area=[]
for i in region:
frequency.append((data.loc[data["Region"]== i, "Frequency"]))
estimated_unemployment_rate.append((data2.loc[data2["Region"]== i, "Estimated Unemployment Rate (%)"]).sum())
estimated_employed.append((data2.loc[data2["Region"]== i, "Estimated Employed"]).sum())
estimated_labour_participation_rate.append((data.loc[data["Region"]== i, "Estimated Labour Participation Rate (%)"]).sum()/28)
aggregated_data = pd.DataFrame(list(zip(region, frequency, estimated_unemployment_rate, estimated_employed, estimated_labour_participation_rate)),
columns=["Region", "Frequency", "Estimated Unemployment Rate (%)", "Estimated Employed",
"Estimated Labour Participation Rate (%)"])
aggregated_data.head()
| Region | Frequency | Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | |
|---|---|---|---|---|---|
| 0 | Andhra Pradesh | 0 Monthly 1 Monthly 2 Monthl... | 86.64 | 154254800 | 39.375714 |
| 1 | Assam | 14 Monthly 15 Monthly 16 Monthl... | 48.56 | 108102755 | 41.663571 |
| 2 | Bihar | 26 Monthly 27 Monthly 28 Monthl... | 194.71 | 236068280 | 38.153929 |
| 3 | Chhattisgarh | 40 Monthly 41 Monthly 42 Monthl... | 78.19 | 84213492 | 42.810714 |
| 4 | Delhi | 54 Monthly 55 Monthly 56 Monthl... | 184.14 | 46328219 | 38.929643 |
#sorting data according to Estimated Unemployment Rate
datas = aggregated_data.sort_values(by=["Estimated Unemployment Rate (%)"], ascending=False)
datas.head()
| Region | Frequency | Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | |
|---|---|---|---|---|---|
| 7 | Haryana | 94 Monthly 95 Monthly 96 Monthl... | 274.77 | 68440590 | 42.737143 |
| 23 | Tripura | 304 Monthly 305 Monthly 306 Monthl... | 250.55 | 13972916 | 61.823929 |
| 10 | Jharkhand | 133 Monthly 134 Monthly 135 Monthl... | 195.39 | 87706424 | 41.670714 |
| 2 | Bihar | 26 Monthly 27 Monthly 28 Monthl... | 194.71 | 236068280 | 38.153929 |
| 4 | Delhi | 54 Monthly 55 Monthly 56 Monthl... | 184.14 | 46328219 | 38.929643 |
#Top 10 regions with the highest Estimated Unemployment Rate
datas=datas.head(10)
datas
| Region | Frequency | Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | |
|---|---|---|---|---|---|
| 7 | Haryana | 94 Monthly 95 Monthly 96 Monthl... | 274.77 | 68440590 | 42.737143 |
| 23 | Tripura | 304 Monthly 305 Monthly 306 Monthl... | 250.55 | 13972916 | 61.823929 |
| 10 | Jharkhand | 133 Monthly 134 Monthly 135 Monthl... | 195.39 | 87706424 | 41.670714 |
| 2 | Bihar | 26 Monthly 27 Monthly 28 Monthl... | 194.71 | 236068280 | 38.153929 |
| 4 | Delhi | 54 Monthly 55 Monthly 56 Monthl... | 184.14 | 46328219 | 38.929643 |
| 17 | Puducherry | 231 Monthly 232 Monthly 233 Monthl... | 179.42 | 3652629 | 36.207500 |
| 8 | Himachal Pradesh | 108 Monthly 109 Monthly 110 Monthl... | 160.65 | 20338849 | 44.222143 |
| 19 | Rajasthan | 257 Monthly 258 Monthly 259 Monthl... | 158.68 | 197317522 | 39.973214 |
| 9 | Jammu & Kashmir | 122 Monthly 123 Monthly 124 Monthl... | 148.30 | 29790285 | 30.773214 |
| 21 | Tamil Nadu | 276 Monthly 277 Monthly 278 Monthl... | 121.87 | 219878981 | 40.872143 |
#total estimated employed
datas['Estimated Employed'].sum()
923494695
#Estimated employment rate of countries with highest unemployment rates
figure = px.bar(datas, y='Estimated Employed', x='Region', title='Estimated employment rate of countries with highest unemployment rates')
figure.show()
#Regions with the highest Unemployment cases
figure = px.bar(datas, y='Estimated Unemployment Rate (%)', x='Region', title='Regions with the highest Unemployment cases')
figure.show()
#Estimated Labour Participation Rate of regions with highest unemploment rates
figure = px.bar(datas, y='Estimated Labour Participation Rate (%)', x='Region', title='Estimated Labour Participation Rate of regions with highest unemploment rates')
figure.show()